started: AL26Apr2019
last updated: AL30Aug2019
Input sequencing data: 1,850 vars x 715 samples ( 519BC = 258UBC + 260CBC and 197NFE)
Input eigenvectors for 715 samples (calculated using 1,634 variants not in LD)
Add eigenvectors to phenotypes, make PCA plots and
calculate 19 PC outliers: outside of mean +/- 3*sd
Sys.time()
## [1] "2019-08-30 15:44:57 BST"
rm(list=ls())
graphics.off()
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s15_add_PCs_exclude_outliers"
opts_knit$set(root.dir = base_folder)
options(stringsAsFactors = F)
options(warnPartialMatchArgs = T,
warnPartialMatchAttr = T,
warnPartialMatchDollar = T)
#options(error = browser()) # Type Q or c to exit, drop browser level
# https://support.rstudio.com/hc/en-us/articles/200713843?version=1.1.456&mode=desktop
# https://stackoverflow.com/questions/13052522/how-to-leave-the-r-browser-mode-in-the-console-window/13052588
# Threshold for outlier detection
th <- 3 # mean +/- (th*sd)
# Sequencing data (for wecare phenotypes: case/control status)
source_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s12_check_BRCA1_BRCA2_PALB2"
load(paste(source_folder, "s03_exclude_BRCA1_BCRA2_PALB2_carriers.RData", sep="/"))
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s15_add_PCs_exclude_outliers"
# Eigenvectors
eigenvectors_file <- paste(base_folder, "ampliseq_nfe_1634_715_100PCs.eigenvec", sep="/")
eigenvectors.df <- read.table(eigenvectors_file, header=T, sep="\t",quote="")
# Eigenvalues
eigenvalues_file <- paste(base_folder, "ampliseq_nfe_1634_715_100PCs.eigenval", sep="/")
eigenvalues.df <- read.table(eigenvalues_file, header=F, sep="\t",quote="")
# Clean-up
rm(source_folder, eigenvectors_file, eigenvalues_file)
# List of objects
ls()
## [1] "base_folder" "eigenvalues.df" "eigenvectors.df" "genotypes.mx" "phenotypes.df" "th" "variants.df"
# Dimentions of objects
dim(eigenvectors.df)
## [1] 715 102
dim(eigenvalues.df)
## [1] 100 1
dim(genotypes.mx)
## [1] 1850 715
dim(phenotypes.df)
## [1] 715 24
dim(variants.df)
## [1] 1850 65
# Update eigenvectors
rownames(eigenvectors.df) <- eigenvectors.df$FID
eigenvectors.df <- eigenvectors.df[,-1]
"long_ids" -> colnames(eigenvectors.df)[1]
eigenvectors.df[1:5,1:5]
## long_ids PC1 PC2 PC3 PC4
## 100_S8_L007 100_S8_L007 -0.00993811 -0.006220860 -0.000233260 0.000518659
## 101_S528_L008 101_S528_L008 -0.00812998 -0.003457840 0.000624146 0.002445700
## 102_S466_L008 102_S466_L008 -0.00241840 0.004862620 -0.004549510 -0.028571000
## 103_S147_L007 103_S147_L007 0.03403750 0.045008900 -0.019662100 0.025902100
## 104_S325_L008 104_S325_L008 -0.00373370 0.000680981 -0.000333239 -0.003197130
plot(eigenvalues.df$V1, type="b",
xlab="Eigenvector", ylab="Eigenvalue",
ylim=c(0,15), main="Top 100 eigenvectors")
plot(eigenvalues.df$V1[1:20], type="b",
xlab="Eigenvector", ylab="Eigenvalue",
ylim=c(0,15), main="Top 20 eigenvectors")
rm(eigenvalues.df)
# Add factor column with verbal lables for phenotypes to be used in plot
group <- factor(phenotypes.df$cc, levels = c(0,1,-1), labels = c("UBC","CBC","NFE"))
phenotypes.df <- data.frame(phenotypes.df,group)
table(phenotypes.df$group)
##
## UBC CBC NFE
## 258 260 197
# Add column with sample IDs for the plot
sample_id <- phenotypes.df$Sample_num
sample_id[phenotypes.df$group=="NFE"] <- phenotypes.df$long_ids[phenotypes.df$group=="NFE"]
phenotypes.df <- data.frame(phenotypes.df,sample_id)
head(phenotypes.df$sample_id)
## [1] "100" "101" "102" "103" "104" "105"
tail(phenotypes.df$sample_id)
## [1] "NA20819" "NA20821" "NA20822" "NA20826" "NA20828" "NA20832"
# Add 5 top eigenvectors to phenotypes
phenotypes.df <- full_join(phenotypes.df,eigenvectors.df[,1:6], by="long_ids")
# Prepare colour scale for ggplots
myColours <- c("NFE"="green", "UBC"="blue", "CBC"="red")
myColourScale <- scale_colour_manual(values=myColours)
# Clean-up
rm(group, sample_id, eigenvectors.df, myColours)
pc1_mean <- mean(phenotypes.df$PC1)
pc1_sd <- sd(phenotypes.df$PC1)
pc1_min <- pc1_mean - pc1_sd * th
pc1_max <- pc1_mean + pc1_sd * th
pc2_mean <- mean(phenotypes.df$PC2)
pc2_sd <- sd(phenotypes.df$PC2)
pc2_min <- pc2_mean - pc2_sd * th
pc2_max <- pc2_mean + pc2_sd * th
pc3_mean <- mean(phenotypes.df$PC3)
pc3_sd <- sd(phenotypes.df$PC3)
pc3_min <- pc3_mean - pc3_sd * th
pc3_max <- pc3_mean + pc3_sd * th
pc4_mean <- mean(phenotypes.df$PC4)
pc4_sd <- sd(phenotypes.df$PC4)
pc4_min <- pc4_mean - pc4_sd * th
pc4_max <- pc4_mean + pc4_sd * th
pc5_mean <- mean(phenotypes.df$PC5)
pc5_sd <- sd(phenotypes.df$PC5)
pc5_min <- pc5_mean - pc5_sd * th
pc5_max <- pc5_mean + pc5_sd * th
rm(pc1_mean, pc1_sd,
pc2_mean, pc2_sd,
pc3_mean, pc3_sd,
pc4_mean, pc4_sd,
pc5_mean, pc5_sd,
th)
# Detect outliers on each PC
pc1_outlier <- phenotypes.df$PC1 < pc1_min | phenotypes.df$PC1 > pc1_max
pc2_outlier <- phenotypes.df$PC2 < pc2_min | phenotypes.df$PC2 > pc2_max
pc3_outlier <- phenotypes.df$PC3 < pc3_min | phenotypes.df$PC3 > pc3_max
pc4_outlier <- phenotypes.df$PC4 < pc4_min | phenotypes.df$PC4 > pc4_max
pc5_outlier <- phenotypes.df$PC5 < pc5_min | phenotypes.df$PC5 > pc5_max
# Detect outliers on any PC
pc_outlier <- pc1_outlier | pc2_outlier | pc3_outlier | pc4_outlier | pc5_outlier
# Check counts of outliers
sum(pc1_outlier)
## [1] 17
sum(pc2_outlier)
## [1] 9
sum(pc3_outlier)
## [1] 6
sum(pc4_outlier)
## [1] 9
sum(pc5_outlier)
## [1] 10
sum(pc_outlier)
## [1] 19
# Add outliers data to phenotypes dataframe
phenotypes.df <- data.frame(phenotypes.df, pc_outlier, pc1_outlier, pc2_outlier,
pc3_outlier, pc4_outlier, pc5_outlier)
phenotypes.df %>%
filter(pc_outlier) %>%
select(sample_id, pc_outlier, pc1_outlier, pc2_outlier,
pc3_outlier, pc4_outlier, pc5_outlier) %>%
arrange(as.integer(sample_id))
## sample_id pc_outlier pc1_outlier pc2_outlier pc3_outlier pc4_outlier pc5_outlier
## 1 3 TRUE TRUE TRUE FALSE TRUE TRUE
## 2 16 TRUE TRUE TRUE TRUE TRUE TRUE
## 3 23 TRUE FALSE FALSE FALSE FALSE TRUE
## 4 141 TRUE TRUE TRUE FALSE FALSE FALSE
## 5 148 TRUE TRUE FALSE FALSE TRUE FALSE
## 6 246 TRUE TRUE TRUE FALSE FALSE FALSE
## 7 273 TRUE TRUE FALSE FALSE TRUE TRUE
## 8 275 TRUE TRUE TRUE FALSE TRUE FALSE
## 9 277 TRUE TRUE TRUE TRUE TRUE TRUE
## 10 311 TRUE TRUE FALSE TRUE FALSE TRUE
## 11 323 TRUE TRUE TRUE TRUE TRUE FALSE
## 12 329 TRUE TRUE FALSE FALSE FALSE TRUE
## 13 352 TRUE TRUE FALSE FALSE FALSE FALSE
## 14 355 TRUE FALSE FALSE FALSE FALSE TRUE
## 15 366 TRUE TRUE TRUE TRUE TRUE TRUE
## 16 368 TRUE TRUE FALSE FALSE TRUE FALSE
## 17 375 TRUE TRUE TRUE TRUE FALSE TRUE
## 18 403 TRUE TRUE FALSE FALSE FALSE FALSE
## 19 408 TRUE TRUE FALSE FALSE FALSE FALSE
# Clean-up
rm(pc_outlier, pc1_outlier, pc2_outlier, pc3_outlier, pc4_outlier, pc5_outlier)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC1,PC2)) +
geom_point(aes(col=group, text=sample_id)) +
labs(title="PC1 vs PC2", x="PC1", y="PC2") +
theme(legend.title=element_blank()) + # To suppress the legend title
myColourScale + # otherwise it would be "group"
geom_vline(xintercept=c(pc1_min, pc1_max), linetype="dotted", size=0.2) +
geom_hline(yintercept=c(pc2_min, pc2_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC2,PC3)) +
geom_point(aes(col=group, text=sample_id)) +
labs(title="PC2 vs PC3", x="PC2", y="PC3") +
theme(legend.title=element_blank()) +
myColourScale +
geom_vline(xintercept=c(pc2_min, pc2_max), linetype="dotted", size=0.2) +
geom_hline(yintercept=c(pc3_min, pc3_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC3,PC4)) +
geom_point(aes(col=group, text=sample_id)) +
labs(title="PC3 vs PC4", x="PC3", y="PC4") +
theme(legend.title=element_blank()) +
myColourScale +
geom_vline(xintercept=c(pc3_min, pc3_max), linetype="dotted", size=0.2) +
geom_hline(yintercept=c(pc4_min, pc4_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC4,PC5)) +
geom_point(aes(col=group, text=sample_id)) +
labs(title="PC4 vs PC5", x="PC4", y="PC5") +
theme(legend.title=element_blank()) +
myColourScale +
geom_vline(xintercept=c(pc4_min, pc4_max), linetype="dotted", size=0.2) +
geom_hline(yintercept=c(pc5_min, pc5_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g, myColourScale,
pc1_min, pc1_max, pc2_min, pc2_max, pc3_min, pc3_max, pc4_min, pc4_max, pc5_min, pc5_max)
save.image(paste(base_folder, "s01_add_PCs_1634_715.RData", sep="/"))
ls()
## [1] "base_folder" "genotypes.mx" "phenotypes.df" "variants.df"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plotly_4.9.0 ggplot2_3.2.0 dplyr_0.8.1 knitr_1.23
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 later_0.8.0 pillar_1.4.1 compiler_3.5.1 tools_3.5.1 digest_0.6.19 jsonlite_1.6 evaluate_0.14 tibble_2.1.3 gtable_0.3.0 viridisLite_0.3.0 pkgconfig_2.0.2 rlang_0.3.4 shiny_1.3.2 crosstalk_1.0.0 yaml_2.2.0 xfun_0.7 withr_2.1.2 stringr_1.4.0 httr_1.4.0 htmlwidgets_1.3 grid_3.5.1 tidyselect_0.2.5 glue_1.3.1 data.table_1.12.2 R6_2.4.0 rmarkdown_1.13 purrr_0.3.2 tidyr_0.8.3 magrittr_1.5 promises_1.0.1 scales_1.0.0 htmltools_0.3.6 assertthat_0.2.1 xtable_1.8-4 mime_0.7 colorspace_1.4-1 httpuv_1.5.1 labeling_0.3 stringi_1.4.3 lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
Sys.time()
## [1] "2019-08-30 15:45:00 BST"